- Notifications
You must be signed in to change notification settings - Fork 849
/
Copy pathOML4Py Feature Extraction ESA.dsnb
executable file
·1 lines (1 loc) · 15.9 KB
/
OML4Py Feature Extraction ESA.dsnb
1
[{"layout":null,"template":null,"templateConfig":null,"name":"OML4Py Feature Extraction ESA","description":null,"readOnly":false,"type":"low","paragraphs":[{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":null,"title":null,"message":["%md"," "],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","# Explicit Semantic Analysis (ESA) for Text Analytics","","Explicit Semantic Analysis (ESA) is an unsupervised algorithm for feature extraction of explicit features based on an existing knowledge base. It is the process of understanding the meaning of a section of text as a combination of the concepts found in the text. Compared to other techniques such as Latent Dirichlet Allocation (LDA) or Term Frequency-Inverse Document Frequency (TF-IDF), ESA offers some unique benefits. Most notably, it improves text document categorization by calculating `semantic relatedness` (how similar in meaning two words or pieces of text are to each other) between the documents and a set of topics that are explicitly defined and described by humans.","","The Oracle Machine Learning for Python function *oml.esa* extracts text-based features from a corpus of documents and performs document similarity comparisons. All processing occurs inside Oracle Autonomous Database.","","In this notebook, we cover the basic usage of ESA, where we extract topics from the comments provided. ",""," - Create a small sample dataset"," - Use the ESA Model to predict a feature vector on test data"," - Identify top features"," - Document similarity"," - Change model settings","","Copyright (c) 2024 Oracle Corporation ","###### <a href=\"https://oss.oracle.com/licenses/upl/\" onclick=\"return ! 
window.open('https://oss.oracle.com/licenses/upl/');\">The Universal Permissive License (UPL), Version 1.0<\/a>","---"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"For more information...","message":["%md","","* <a href=\"https://docs.oracle.com/en/cloud/paas/autonomous-data-warehouse-cloud/index.html\" target=\"_blank\">Oracle ADB Documentation<\/a>","* <a href=\"https://github.com/oracle-samples/oracle-db-examples/tree/main/machine-learning\" target=\"_blank\">OML folder on Oracle GitHub<\/a>","* <a href=\"https://www.oracle.com/machine-learning\" target=\"_blank\">OML Web Page<\/a>","* <a href=\"https://www.oracle.com/goto/ml-explicit-semantic-analysis\" target=\"_blank\">OML Explicit Semantic Analysis<\/a>"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Import libraries","message":["%python","","import oml","import pandas as pd"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Create test and training data","message":["%python","","dat = oml.push(pd.DataFrame("," {'COMMENTS':['Aids in Africa: Planning for a long war',"," 'Mars rover maneuvers for rim shot',"," 'Mars express confirms presence of water at Mars south pole',"," 'NASA announces major Mars rover finding',"," 'Drug access, Asia threat in focus at AIDS 
summit',"," 'NASA Mars Odyssey THEMIS image: typical crater',"," 'Road blocks for Aids'],"," 'ID':[1,2,3,4,5,6,7]})).split(ratio=(0.7,0.3), seed = 1234)","","train_dat = dat[0]","test_dat = dat[1]"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Check the test data","message":["%python","","z.show(test_dat)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Check the training data","message":["%python","","z.show(train_dat)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":"[{\"raw\":{\"height\":300,\"lastColumns\":[],\"version\":1}}]","hideInIFrame":false,"selectedVisualization":"raw","title":"View the help file for ESA","message":["%python","","help(oml.esa)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Create a policy for text indexing","message":["%script","","-- Drop the policy if it exists","BEGIN ctx_ddl.drop_policy('ESA_TXT_POLICY'); "," exception when others then NULL;","END;","","BEGIN ctx_ddl.create_policy('ESA_TXT_POLICY'); "," exception when others then 
NULL;","END;"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Specify model settings","message":["%python","","odm_settings = {'odms_text_policy_name': 'ESA_TXT_POLICY',"," '\"ODMS_TEXT_MIN_DOCUMENTS\"': 1,"," '\"ESAS_MIN_ITEMS\"': 1}","","ctx_settings = {'COMMENTS':"," 'TEXT(POLICY_NAME:ESA_TXT_POLICY)(TOKEN_TYPE:STEM)'}"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"Create an ESA model object ","message":["%python","","# Delete if ESA model exists","try:"," oml.drop(model = 'ESA_MOD')","except:"," print('No such model')"," ","esa_mod = oml.esa(**odm_settings)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"raw","title":"View the model settings","message":["%python","","esa_mod.settings"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":" Fit the model according to the training data and parameter settings","message":["%python","","esa_mod = esa_mod.fit(train_dat, case_id='ID', ctx_settings = 
ctx_settings)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":6,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":"[{\"raw\":{\"height\":300,\"lastColumns\":[],\"version\":1}}]","hideInIFrame":false,"selectedVisualization":"raw","title":"Show the model details","message":["%python","","esa_mod"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":"[{\"raw\":{\"height\":300,\"lastColumns\":[],\"version\":1}}]","hideInIFrame":false,"selectedVisualization":"raw","title":"Show the model details for each feature","message":["%python","","z.show(esa_mod.features.round(4))"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Predicted top 2 features","message":["%python","","z.show(esa_mod.transform(test_dat, "," supplemental_cols = test_dat[:, ['ID', 'COMMENTS']], "," topN = 2).sort_values(by = ['ID']).round(4))"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"View the model name internal to the database","message":["%sql","","SELECT * from all_mining_models WHERE 
ALGORITHM='EXPLICIT_SEMANTIC_ANALYS'"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Rename the model in the database","message":["%python","","esa_mod.model_name='ESA_MOD'"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"View new model name in the database","message":["%sql","","select * from USER_MINING_MODELS where model_name='ESA_MOD'"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":"[{\"raw\":{\"height\":300,\"lastColumns\":[],\"version\":1}}]","hideInIFrame":false,"selectedVisualization":"raw","title":"View renamed model in Python","message":["%python","","oml.esa(model_name = 'ESA_MOD') "],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Make predictions on the test data","message":["%python","","esa_pred = esa_mod.predict(test_dat, supplemental_cols = test_dat[:, ['ID', 
'COMMENTS']])","z.show(esa_pred)"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Change the settings and refit the model","message":["%python","","new_setting = {'ESAS_VALUE_THRESHOLD': '0.01',"," 'ODMS_TEXT_MAX_FEATURES': '2',"," 'FEAT_NUM_FEATURES': '2',"," 'ESAS_TOPN_FEATURES': '2'}"," "],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":null,"title":"Refit the model with new settings - 2 features maximum ","message":["%python","","# Delete if ESA model exists","try:"," oml.drop(model = 'ESA_MOD_2')","except:"," print('No such model')"," ","esa_mod.set_params(**new_setting).fit(train_dat, case_id = 'ID', "," ctx_settings = ctx_settings,"," model_name='ESA_MOD_2')","","z.show(esa_mod.feature_compare(test_dat,"," compare_cols = ['COMMENTS'],"," supplemental_cols = ['COMMENTS']))"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"List ESA model views","message":["%sql","","SELECT view_name, view_type FROM user_mining_model_views","WHERE model_name='ESA_MOD_2'","ORDER BY 
view_name;"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"The VA view shows associations among features (topics) and words","message":["%md","","*ATTRIBUTE_SUBNAME* refers to words, or tokens. The strength of the associations is measured by *COEFFICIENT*. Higher coefficient values indicate a stronger association."],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":null,"message":["%sql","","SELECT * from dm$vaesa_mod_2 FETCH FIRST 10 ROWS ONLY"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"table","title":"Show the associations for selected words","message":["%sql","","SELECT ATTRIBUTE_SUBNAME, count(ATTRIBUTE_SUBNAME) FROM dm$vaesa_mod_2","WHERE ATTRIBUTE_SUBNAME='MARS'","GROUP BY ATTRIBUTE_SUBNAME"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":"Clean-up","message":["%python","","try:"," oml.drop(model = 'ESA_MOD')","except:"," print('No such model')","","try:"," oml.drop(model = 'ESA_MOD_2')","except:"," print('No such model')"," 
"],"enabled":true,"result":null,"sizeX":0,"hideCode":false,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":true,"hideVizConfig":false,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md","","## End of Script"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":false,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"},{"col":0,"visualizationConfig":null,"hideInIFrame":false,"selectedVisualization":"html","title":null,"message":["%md"],"enabled":true,"result":null,"sizeX":0,"hideCode":true,"width":12,"hideResult":true,"dynamicFormParams":null,"row":0,"hasTitle":false,"hideVizConfig":true,"hideGutter":true,"relations":[],"forms":"[]"}],"version":"6","snapshot":false,"tags":null}]